---
title: "TET3_in_vivo"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
library(readxl)
library(tidyr)
library(purrr)
library(dplyr)
library(stringr)
library(seqLogo)
options(digits=12)
```

the function to plot the logo

```{r}
vivologo <- function(model) {
  mycoeff <- coef(summary(model))
  #
  V1_A <-1 
  V1_C <-exp(mycoeff["V1C","Estimate"])
  V1_G <-exp(mycoeff["V1G","Estimate"])
  V1_T <-exp(mycoeff["V1T","Estimate"])
  unV1 <-c(V1_A,V1_C,V1_G, V1_T)
  fact <- sum(c(V1_A,V1_C,V1_G, V1_T))
  myV1 <- unV1/fact
  #
  V2_A <-1 
  V2_C <-exp(mycoeff["V2C","Estimate"])
  V2_G <-exp(mycoeff["V2G","Estimate"])
  V2_T <-exp(mycoeff["V2T","Estimate"])
  unV2 <-c(V2_A,V2_C,V2_G, V2_T)
  fact <- sum(c(V2_A,V2_C,V2_G, V2_T))
  myV2 <- unV2/fact
  #
  myV3 <- c(0.0,1.0,0.0,0.0)
  #
  myV4 <- c(0.0,0.0,1.0,0.0)
  #
  V5_A <-1 
  V5_C <-exp(mycoeff["V5C","Estimate"])
  V5_G <-exp(mycoeff["V5G","Estimate"])
  V5_T <-exp(mycoeff["V5T","Estimate"])
  unV5 <-c(V5_A,V5_C,V5_G, V5_T)
  fact <- sum(c(V5_A,V5_C,V5_G, V5_T))
  myV5 <- unV5/fact
  #
  V6_A <-1 
  V6_C <-exp(mycoeff["V6C","Estimate"])
  V6_G <-exp(mycoeff["V6G","Estimate"])
  V6_T <-exp(mycoeff["V6T","Estimate"])
  unV6 <-c(V6_A,V6_C,V6_G, V6_T)
  fact <- sum(c(V6_A,V6_C,V6_G, V6_T))
  myV6 <- unV6/fact
  #
  logo_input=data.frame(myV1,myV2,myV3,myV4,myV5,myV6)
  p <- makePWM(logo_input)
  seqLogo(p, ic.scale=FALSE)
}
```

```{r}
filename <-"../Datasets/TET-TKO-TET3-expression.xlsx"
sheet <- "ALLDATA_R"
range1 <- "A3:C259"
```

```{r}
TET <- read_excel(filename, sheet = sheet, range = range1) %>% select(-c('revcom'))
head(TET)
```

There is a rounding problem that I do not understand that will have to be solved.

```{r}
seqmotifs<-cbind(str_split_fixed(TET$Motif,"",6),TET$`72 hr loss`)
seqmotifs<-data.frame(seqmotifs)
colnames(seqmotifs)<-c("V1","V2","V3","V4","V5","V6","velocity")
seqmotifs$velocity<-as.double(as.character(seqmotifs$velocity))
seqmotifs$logvelocity <-log(seqmotifs$velocity)
head(seqmotifs)
```


```{r}
model<-lm(logvelocity~V1+V2+V5+V6,data=seqmotifs)
summary(model)
```

Notice the very high significance values!

```{r}
plot(seqmotifs$logvelocity,as.vector(model$fitted.values),
     main="TET model with 4 positions",
     xlab="log velocity(experiment)",
     ylab="log velocity(prediction)"
     )
abline(a=0,b=1)
```
```{r}
cor(seqmotifs$logvelocity,as.vector(model$fitted.values))
```
```{r}
vivologo(model)
```

<h2>use only half the data for the model, predict the other half</h2>

```{r}
ind <- sample(c(TRUE, FALSE), nrow(seqmotifs), replace=TRUE, prob=c(0.5, 0.5))
seqmotifs_2a <-seqmotifs[ind,]
seqmotifs_2b <-seqmotifs[!ind,]
```
  
```{r}
model_sample <-lm(logvelocity~V1+V2+V5+V6,data=seqmotifs_2a)
summary(model_sample)
```

```{r}
vivologo(model_sample)
```

check how well the model fits the data that have been used for its calculation

```{r}
plot(seqmotifs_2a$logvelocity,predict(model_sample,seqmotifs_2a),
     main="TET model with 4 positions (fitting)",
     xlab="log velocity(experiment)",
     ylab="log velocity(prediction)")
abline(a=0,b=1)
```
```{r}
cor(seqmotifs_2a$logvelocity,predict(model_sample,seqmotifs_2a))
```

```{r}
plot(seqmotifs_2b$logvelocity,predict(model_sample,seqmotifs_2b),
     main="TET model with 4 positions (predicting)",
     xlab="log velocity(experiment)",
     ylab="log velocity(prediction)")
abline(a=0,b=1)
```
```{r}
cor(seqmotifs_2b$logvelocity,predict(model_sample,seqmotifs_2b))
```
